This notebook: (1) reads in the output of pq_parser.ipynb ("pq_metadata.csv"); (2) cleans that output; (3) writes the cleaned corpus ("01_pq_metaclean.csv").
# Read the parsed ProQuest metadata table from the working-data folder.
pq_metadata <- data.table::fread("Data/02_Working/pq_metadata.csv")
# Show the column names, then the row count (6589 in the recorded run).
names(pq_metadata)
nrow(pq_metadata)
[1] "Title" "Publication title" "Publication year" "Document URL" "Full text"
[6] "Links" "Section" "Publication subject" "ISSN" "Copyright"
[11] "Abstract" "Publication info" "Last updated" "Place of publication" "Location"
[16] "Author" "Publisher" "Identifier / keyword" "Source type" "ProQuest document ID"
[21] "Country of publication" "Language of publication" "Publication date" "Subject" "Database"
[26] "Document type"
[1] 6589
# Preview the first 6 rows of the raw table.
head(pq_metadata, n = 6L)
# Open question: how should missing values be treated? Subset the affected
# rows, look at where they come from, and decide whether they can be fixed
# programmatically or need manual repair.
# Rows whose Full text is blank or NA (51 in the recorded run).
pq_empty <- subset(pq_metadata, is.na(`Full text`) | `Full text` == "")
nrow(pq_empty)
[1] 51
# Open question: how should missing values be treated? Subset the affected
# rows, look at where they come from, and decide whether they can be fixed
# programmatically or need manual repair.
# Base-R equivalent for the Full text condition alone:
# pq_metaclean <- pq_metadata[!(pq_metadata$`Full text` == "" | is.na(pq_metadata$`Full text`)), ]
# Keep only rows that have both a non-missing, non-blank Full text and a
# non-missing, non-blank ProQuest document ID.
pq_metaclean <- pq_metadata %>%
  drop_na(`Full text`, `ProQuest document ID`) %>%
  filter(`Full text` != "", `ProQuest document ID` != "")
# Note: this count covers rows dropped for either column, not Full text alone.
numReduced <- nrow(pq_metadata) - nrow(pq_metaclean)
print(paste("NA or blank Full Text rows removed:", numReduced))
[1] "NA or blank Full Text rows removed: 128"
# List the distinct publication titles present after the missing-row filter.
unique(pq_metaclean[["Publication title"]])
[1] "Bangor Daily News; Bangor, Me." "Bangor Daily News; Bango r, Me." "Bangor Daily News; Bang or, Me."
[4] "Bang or Daily News; Bangor, Me." "Kennebec Journal; Augusta, Me." "Maine Times; Portland, Me."
[7] "Morning Sentinel; Waterville, Me." "Portland Press Herald; Portland, Me." "Portland Press Herald; Port land, Me."
[10] "Portland Pr ess Herald; Portland, Me." "Portland Pre ss Herald; Portland, Me." "Portl and Press Herald; Portland, Me."
[13] "Portland Press Herald Portland, Me." "Portlan d Press Herald; Portland, Me." "Port land Press Herald; Portland, Me."
[16] "" "Sun Journal; Lewiston, Me."
# Open question: how should missing values be treated? Subset the affected
# rows, look at where they come from, and decide whether they can be fixed
# programmatically or need manual repair.
# Rows whose Publication title is blank or NA (1 in the recorded run; there
# were 50 before the Full text clean).
pq_empty <- subset(
  pq_metaclean,
  is.na(`Publication title`) | `Publication title` == ""
)
nrow(pq_empty)
[1] 1
# How many of the blank/NA-title rows also have a blank/NA Full text?
# (0 in the recorded run; there were 37 before the Full text clean.)
with(pq_empty, sum(is.na(`Full text`) | `Full text` == ""))
[1] 0
# Fill blank/NA publication titles from the leading part of `Publication info`
# (everything before the first "[" or ":").
# The split is taken per row: unlist(strsplit(...))[1] would yield a single
# value (the first piece of the first row) that ifelse() recycles into every
# blank row, regardless of that row's own Publication info.
pq_metaclean <- pq_metaclean %>%
  mutate(`Publication title` = ifelse(
    `Publication title` == "" | is.na(`Publication title`),
    vapply(
      strsplit(`Publication info`, split = "\\[|:"),
      function(parts) parts[1],
      character(1)
    ),
    `Publication title`
  ))
# Verify the blank-title fill: the empty string should no longer be listed.
unique(pq_metaclean[["Publication title"]])
[1] "Bangor Daily News; Bangor, Me." "Bangor Daily News; Bango r, Me." "Bangor Daily News; Bang or, Me."
[4] "Bang or Daily News; Bangor, Me." "Kennebec Journal; Augusta, Me." "Maine Times; Portland, Me."
[7] "Morning Sentinel; Waterville, Me." "Portland Press Herald; Portland, Me." "Portland Press Herald; Port land, Me."
[10] "Portland Pr ess Herald; Portland, Me." "Portland Pre ss Herald; Portland, Me." "Portl and Press Herald; Portland, Me."
[13] "Portland Press Herald Portland, Me." "Portlan d Press Herald; Portland, Me." "Port land Press Herald; Portland, Me."
[16] "Bangor Daily News ; Bangor, Me. " "Sun Journal; Lewiston, Me."
# Normalise publication-title variants to one canonical spelling per paper.
# The variants are stray spaces inside a word, a space before punctuation, a
# missing semicolon, and a longer "Central Maine Morning Sentinel" name.
# Canonical titles:
#   "Bangor Daily News; Bangor, Me."
#   "Morning Sentinel; Waterville, Me."
#   "Kennebec Journal; Augusta, Me."
#   "Portland Press Herald; Portland, Me."
# Each variant is listed exactly once; titles not listed are left unchanged.
pq_metaclean <- pq_metaclean %>%
  mutate(`Publication title` = recode(`Publication title`,
    'Bangor Daily News; Bang or, Me.' = 'Bangor Daily News; Bangor, Me.',
    'Bangor Daily News; Ba ngor, Me.' = 'Bangor Daily News; Bangor, Me.',
    'Bangor Dail y News; Bangor, Me.' = 'Bangor Daily News; Bangor, Me.',
    'Bangor Daily News; Bango r, Me.' = 'Bangor Daily News; Bangor, Me.',
    'Bang or Daily News; Bangor, Me.' = 'Bangor Daily News; Bangor, Me.',
    "Bangor Daily News ; Bangor, Me. " = 'Bangor Daily News; Bangor, Me.',
    'Morning Sentinel; Wate rville, Me.' = 'Morning Sentinel; Waterville, Me.',
    'Central Maine Morning Sentinel; Waterville, Me.' = 'Morning Sentinel; Waterville, Me.',
    'Kennebec Journal; Augusta, Me .' = 'Kennebec Journal; Augusta, Me.',
    'Portland Press Herald; Port land, Me.' = 'Portland Press Herald; Portland, Me.',
    'Portland Pr ess Herald; Portland, Me.' = 'Portland Press Herald; Portland, Me.',
    'Portland Pre ss Herald; Portland, Me.' = 'Portland Press Herald; Portland, Me.',
    'Portl and Press Herald; Portland, Me.' = 'Portland Press Herald; Portland, Me.',
    'Portland Press Herald Portland, Me.' = 'Portland Press Herald; Portland, Me.',
    'Portlan d Press Herald; Portland, Me.' = 'Portland Press Herald; Portland, Me.',
    'Port land Press Herald; Portland, Me.' = 'Portland Press Herald; Portland, Me.'
  ))
# Verify the title recode: only the canonical titles should remain.
unique(pq_metaclean[["Publication title"]])
[1] "Bangor Daily News; Bangor, Me." "Kennebec Journal; Augusta, Me." "Maine Times; Portland, Me." "Morning Sentinel; Waterville, Me."
[5] "Portland Press Herald; Portland, Me." "Sun Journal; Lewiston, Me."
# List the distinct publication years to spot malformed values.
unique(pq_metaclean[["Publication year"]])
[1] "1997" "1996" "1995" "1994" "1993" "2004" "2003" "200 3" "2002" "2001" "2009" "2008" "2007" "2006" "2005" "2014" "2013" "2012" "2011"
[20] "2010" "2019" "2018" "2017" "2016" "2015" "2000" "1999" "1998" "201 8" ""
# Repair the two year values that contain a stray space; every other year is
# left as it is.
pq_metaclean <- pq_metaclean %>%
  mutate(
    `Publication year` = recode(
      `Publication year`,
      !!!c("200 3" = "2003", "201 8" = "2018")
    )
  )
# Verify the year recode: the space-containing years should be gone.
unique(pq_metaclean[["Publication year"]])
[1] "1997" "1996" "1995" "1994" "1993" "2004" "2003" "2002" "2001" "2009" "2008" "2007" "2006" "2005" "2014" "2013" "2012" "2011" "2010" "2019" "2018" "2017"
[23] "2016" "2015" "2000" "1999" "1998" ""
# Open question: how should missing values be treated? Subset the affected
# rows, look at where they come from, and decide whether they can be fixed
# programmatically or need manual repair.
# Rows whose Publication year is blank or NA (3 in the recorded run).
pq_empty <- subset(
  pq_metaclean,
  is.na(`Publication year`) | `Publication year` == ""
)
nrow(pq_empty)
[1] 3
# How many of the blank/NA-year rows also have a blank/NA Full text?
with(pq_empty, sum(is.na(`Full text`) | `Full text` == ""))
[1] 0
# Fill blank/NA publication years from `Publication info`.
# Alternative considered (last four characters of `Publication date`):
# pq_metaclean <- pq_metaclean %>%
#   mutate(`Publication year` = ifelse(`Publication year` == "" | is.na(`Publication year`), str_sub(`Publication date`,-4,-1), `Publication year`))
# The leading ".*" is greedy, so the captured group is the last run of four
# digits in the string.
# sub() returns its input unchanged when the pattern does not match, so rows
# whose Publication info has no four-digit run are set to NA rather than
# receiving the whole Publication info string as their "year".
pq_metaclean <- pq_metaclean %>%
  mutate(`Publication year` = ifelse(
    `Publication year` == "" | is.na(`Publication year`),
    ifelse(
      grepl("\\d{4}", `Publication info`),
      sub('.*(\\d{4}).*', '\\1', `Publication info`),
      NA_character_
    ),
    `Publication year`
  ))
# Verify the year fill: the empty string should no longer be listed.
unique(pq_metaclean[["Publication year"]])
[1] "1997" "1996" "1995" "1994" "1993" "2004" "2003" "2002" "2001" "2009" "2008" "2007" "2006" "2005" "2014" "2013" "2012" "2011" "2010" "2019" "2018" "2017"
[23] "2016" "2015" "2000" "1999" "1998"
# Re-check after the fill: rows whose Publication year is still blank or NA
# (0 in the recorded run).
pq_empty <- subset(
  pq_metaclean,
  is.na(`Publication year`) | `Publication year` == ""
)
nrow(pq_empty)
[1] 0
# How many of the remaining blank/NA-year rows also have a blank/NA Full text?
with(pq_empty, sum(is.na(`Full text`) | `Full text` == ""))
[1] 0
# Open question: how should missing values be treated? Subset the affected
# rows, look at where they come from, and decide whether they can be fixed
# programmatically or need manual repair.
# Rows whose Publication date is blank or NA (2 in the recorded run).
pq_empty <- subset(
  pq_metaclean,
  is.na(`Publication date`) | `Publication date` == ""
)
nrow(pq_empty)
[1] 2
# How many of the blank/NA-date rows also have a blank/NA Full text?
with(pq_empty, sum(is.na(`Full text`) | `Full text` == ""))
[1] 0
# If the publication date is blank or NA, fill it from `Publication info`:
# the text between the first "]" or ":" and the next one.
# The split is taken per row: unlist(strsplit(...))[2] would yield a single
# value (from the first row of the table) that ifelse() recycles into every
# blank row, regardless of that row's own Publication info.
pq_metaclean <- pq_metaclean %>%
  mutate(`Publication date` = ifelse(
    `Publication date` == "" | is.na(`Publication date`),
    vapply(
      strsplit(`Publication info`, split = "\\]|:"),
      function(parts) parts[2],
      character(1)
    ),
    `Publication date`
  ))
# Re-check after the fill: rows whose Publication date is still blank or NA
# (0 in the recorded run).
pq_empty <- subset(
  pq_metaclean,
  is.na(`Publication date`) | `Publication date` == ""
)
nrow(pq_empty)
[1] 0
# How many of the remaining blank/NA-date rows also have a blank/NA Full text?
with(pq_empty, sum(is.na(`Full text`) | `Full text` == ""))
[1] 0
# Regex alternation of the newspaper names to strip from the article text.
PubTitles <- "Bangor Daily News|Kennebec Journal|Maine Times|Morning Sentinel|Portland Press Herald|Sun Journal"
# Regex alternation of control characters and square brackets to strip.
TxtFormatting <- "\a|\b|\f|\n|\r|\t|\v|\\[|\\]"

# Clean `Full text` in place. The steps run top to bottom inside one mutate(),
# each one operating on the result of the step before it.
pq_metaclean <- pq_metaclean %>%
  mutate(
    # replace HTML markup/entities (textclean)
    `Full text` = textclean::replace_html(`Full text`, symbol = TRUE),
    # replace URLs with a placeholder token
    `Full text` = textclean::replace_url(`Full text`, replacement = '<<URL>>'),
    # remove publication titles
    `Full text` = gsub(PubTitles, " ", `Full text`),
    # remove text formatting characters and square brackets
    `Full text` = gsub(TxtFormatting, " ", `Full text`),
    # remove double quotes
    `Full text` = gsub("\"", " ", `Full text`),
    # transliterate accented letters to ASCII
    `Full text` = stringi::stri_trans_general(str = `Full text`, id = "Latin-ASCII"),
    # original note: replace "&" with "and" and "%" with "percent"
    # (custom_replace_symbol is defined outside this section)
    `Full text` = custom_replace_symbol(`Full text`),
    # collapse extra white space
    `Full text` = textclean::replace_white(`Full text`)
  )
These document IDs have random spaces in the publication date (e.g., "May 01, 20 14"): DID1 <- "1540791926"; DID2 <- "2257133415"
pq_metadata[pq_metadata$`ProQuest document ID` == DID1, ]
# Preview the cleaned table, then list all publication dates in sorted order
# to spot malformed values. The dates are still text at this point, so the
# sort is alphabetical rather than chronological.
head(pq_metaclean, n = 6L)
sort(pq_metaclean[["Publication date"]])
[1] "22 Nov 1997" "22 Nov 1997" "Apr 08, 1994" "Apr 08, 2011" "Apr 1, 1994" "Apr 1, 1997" "Apr 1, 1997" "Apr 1, 1998" "Apr 1, 2001" "Apr 1, 2003"
[11] "Apr 1, 2005" "Apr 1, 2006" "Apr 1, 2007" "Apr 1, 2007" "Apr 1, 2010" "Apr 1, 2012" "Apr 1, 2014" "Apr 1, 2014" "Apr 1, 2014" "Apr 1, 2014"
[21] "Apr 1, 2014" "Apr 1, 2018" "Apr 1, 2018" "Apr 10, 1996" "Apr 10, 1996" "Apr 10, 1997" "Apr 10, 1998" "Apr 10, 2000" "Apr 10, 2001" "Apr 10, 2003"
[31] "Apr 10, 2004" "Apr 10, 2007" "Apr 10, 2011" "Apr 10, 2012" "Apr 10, 2014" "Apr 10, 2015" "Apr 10, 2016" "Apr 10, 2019" "Apr 10, 2019" "Apr 11, 1994"
[41] "Apr 11, 1996" "Apr 11, 1996" "Apr 11, 1998" "Apr 11, 2001" "Apr 11, 2002" "Apr 11, 2006" "Apr 11, 2007" "Apr 11, 2008" "Apr 11, 2013" "Apr 11, 2016"
[51] "Apr 11, 2017" "Apr 11, 2018" "Apr 11, 2018" "Apr 11, 2018" "Apr 11, 2019" "Apr 12, 1994" "Apr 12, 1997" "Apr 12, 1997" "Apr 12, 1997" "Apr 12, 2002"
[61] "Apr 12, 2004" "Apr 12, 2010" "Apr 12, 2012" "Apr 12, 2012" "Apr 12, 2012" "Apr 12, 2013" "Apr 12, 2016" "Apr 12, 2017" "Apr 12, 2017" "Apr 13, 1994"
[71] "Apr 13, 1996" "Apr 13, 1996" "Apr 13, 2001" "Apr 13, 2004" "Apr 13, 2004" "Apr 13, 2005" "Apr 13, 2008" "Apr 13, 2008" "Apr 13, 2008" "Apr 13, 2013"
[81] "Apr 13, 2014" "Apr 13, 2014" "Apr 13, 2015" "Apr 13, 2016" "Apr 13, 2016" "Apr 13, 2017" "Apr 13, 2019" "Apr 14, 1995" "Apr 14, 1996" "Apr 14, 1997"
[91] "Apr 14, 1999" "Apr 14, 2000" "Apr 14, 2001" "Apr 14, 2002" "Apr 14, 2003" "Apr 14, 2003" "Apr 14, 2010" "Apr 14, 2010" "Apr 14, 2012" "Apr 14, 2016"
[101] "Apr 14, 2019" "Apr 15, 1997" "Apr 15, 1997" "Apr 15, 1997" "Apr 15, 1998" "Apr 15, 1998" "Apr 15, 1998" "Apr 15, 2001" "Apr 15, 2003" "Apr 15, 2003"
[111] "Apr 15, 2006" "Apr 15, 2015" "Apr 15, 2016" "Apr 15, 2016" "Apr 15, 2018" "Apr 16, 1997" "Apr 16, 1998" "Apr 16, 1999" "Apr 16, 2003" "Apr 16, 2005"
[121] "Apr 16, 2007" "Apr 16, 2008" "Apr 16, 2008" "Apr 16, 2015" "Apr 16, 2016" "Apr 16, 2019" "Apr 16, 2019" "Apr 16, 2019" "Apr 17, 1996" "Apr 17, 1996"
[131] "Apr 17, 1996" "Apr 17, 1998" "Apr 17, 2001" "Apr 17, 2001" "Apr 17, 2002" "Apr 17, 2004" "Apr 17, 2013" "Apr 17, 2014" "Apr 17, 2015" "Apr 17, 2016"
[141] "Apr 17, 2017" "Apr 17, 2017" "Apr 17, 2017" "Apr 17, 2017" "Apr 17, 2018" "Apr 17, 2018" "Apr 17, 2018" "Apr 17, 2019" "Apr 17, 2019" "Apr 17, 2019"
[151] "Apr 17, 2019" "Apr 17, 2019" "Apr 18, 1999" "Apr 18, 2001" "Apr 18, 2003" "Apr 18, 2007" "Apr 18, 2007" "Apr 18, 2007" "Apr 18, 2010" "Apr 18, 2013"
[161] "Apr 18, 2014" "Apr 18, 2017" "Apr 18, 2018" "Apr 18, 2018" "Apr 18, 2019" "Apr 18, 2019" "Apr 19, 1994" "Apr 19, 1994" "Apr 19, 1996" "Apr 19, 1997"
[171] "Apr 19, 1998" "Apr 19, 1998" "Apr 19, 1998" "Apr 19, 1998" "Apr 19, 1998" "Apr 19, 2000" "Apr 19, 2006" "Apr 19, 2007" "Apr 19, 2011" "Apr 19, 2012"
[181] "Apr 19, 2013" "Apr 19, 2017" "Apr 2, 1997" "Apr 2, 1999" "Apr 2, 2000" "Apr 2, 2003" "Apr 2, 2003" "Apr 2, 2007" "Apr 2, 2009" "Apr 2, 2009"
[191] "Apr 2, 2009" "Apr 2, 2009" "Apr 2, 2010" "Apr 2, 2012" "Apr 2, 2014" "Apr 2, 2014" "Apr 2, 2014" "Apr 2, 2014" "Apr 2, 2015" "Apr 2, 2017"
[201] "Apr 2, 2019" "Apr 20, 1997" "Apr 20, 1997" "Apr 20, 1997" "Apr 20, 1997" "Apr 20, 1999" "Apr 20, 2000" "Apr 20, 2000" "Apr 20, 2002" "Apr 20, 2004"
[211] "Apr 20, 2005" "Apr 20, 2006" "Apr 20, 2006" "Apr 20, 2010" "Apr 20, 2011" "Apr 20, 2013" "Apr 20, 2014" "Apr 20, 2016" "Apr 20, 2019" "Apr 20, 2019"
[221] "Apr 21, 1996" "Apr 21, 1998" "Apr 21, 1999" "Apr 21, 2000" "Apr 21, 2003" "Apr 21, 2007" "Apr 21, 2010" "Apr 21, 2010" "Apr 21, 2015" "Apr 21, 2017"
[231] "Apr 21, 2019" "Apr 22, 1995" "Apr 22, 1996" "Apr 22, 1997" "Apr 22, 1998" "Apr 22, 1998" "Apr 22, 2000" "Apr 22, 2004" "Apr 22, 2004" "Apr 22, 2015"
[241] "Apr 22, 2017" "Apr 22, 2017" "Apr 22, 2018" "Apr 22, 2018" "Apr 23, 1997" "Apr 23, 1997" "Apr 23, 1997" "Apr 23, 1999" "Apr 23, 2000" "Apr 23, 2000"
[251] "Apr 23, 2000" "Apr 23, 2001" "Apr 23, 2004" "Apr 23, 2004" "Apr 23, 2005" "Apr 23, 2007" "Apr 23, 2009" "Apr 23, 2012" "Apr 23, 2013" "Apr 23, 2014"
[261] "Apr 23, 2014" "Apr 23, 2016" "Apr 23, 2017" "Apr 23, 2017" "Apr 24, 1995" "Apr 24, 1996" "Apr 24, 1996" "Apr 24, 1997" "Apr 24, 1998" "Apr 24, 1999"
[271] "Apr 24, 2000" "Apr 24, 2002" "Apr 24, 2004" "Apr 24, 2010" "Apr 24, 2012" "Apr 24, 2016" "Apr 24, 2016" "Apr 24, 2016" "Apr 24, 2016" "Apr 25, 1998"
[281] "Apr 25, 1999" "Apr 25, 2000" "Apr 25, 2000" "Apr 25, 2000" "Apr 25, 2000" "Apr 25, 2000" "Apr 25, 2003" "Apr 25, 2007" "Apr 25, 2011" "Apr 25, 2016"
[291] "Apr 25, 2016" "Apr 25, 2016" "Apr 25, 2018" "Apr 25, 2019" "Apr 25, 2019" "Apr 26, 1995" "Apr 26, 1998" "Apr 26, 1999" "Apr 26, 2000" "Apr 26, 2000"
[301] "Apr 26, 2004" "Apr 26, 2006" "Apr 26, 2007" "Apr 26, 2011" "Apr 26, 2013" "Apr 26, 2013" "Apr 26, 2016" "Apr 26, 2016" "Apr 26, 2016" "Apr 26, 2016"
[311] "Apr 26, 2016" "Apr 27, 1995" "Apr 27, 1997" "Apr 27, 1998" "Apr 27, 2000" "Apr 27, 2004" "Apr 27, 2004" "Apr 27, 2013" "Apr 27, 2015" "Apr 27, 2016"
[321] "Apr 27, 2016" "Apr 27, 2016" "Apr 27, 2016" "Apr 27, 2016" "Apr 27, 2017" "Apr 27, 2019" "Apr 28, 1999" "Apr 28, 1999" "Apr 28, 1999" "Apr 28, 2000"
[331] "Apr 28, 2000" "Apr 28, 2001" "Apr 28, 2001" "Apr 28, 2001" "Apr 28, 2004" "Apr 28, 2006" "Apr 28, 2010" "Apr 28, 2012" "Apr 28, 2015" "Apr 28, 2016"
[341] "Apr 29, 1994" "Apr 29, 1997" "Apr 29, 1998" "Apr 29, 1998" "Apr 29, 2002" "Apr 29, 2003" "Apr 29, 2008" "Apr 29, 2013" "Apr 29, 2014" "Apr 29, 2014"
[351] "Apr 29, 2014" "Apr 29, 2015" "Apr 29, 2015" "Apr 29, 2015" "Apr 29, 2016" "Apr 29, 2018" "Apr 29, 2018" "Apr 3, 1996" "Apr 3, 2000" "Apr 3, 2004"
[361] "Apr 3, 2008" "Apr 3, 2014" "Apr 3, 2014" "Apr 3, 2015" "Apr 3, 2016" "Apr 3, 2019" "Apr 3, 2019" "Apr 3, 2019" "Apr 30, 1994" "Apr 30, 1997"
[371] "Apr 30, 1998" "Apr 30, 2000" "Apr 30, 2000" "Apr 30, 2002" "Apr 30, 2005" "Apr 30, 2010" "Apr 30, 2010" "Apr 30, 2014" "Apr 30, 2015" "Apr 30, 2015"
[381] "Apr 30, 2017" "Apr 30, 2019" "Apr 4, 1994" "Apr 4, 1996" "Apr 4, 1998" "Apr 4, 1999" "Apr 4, 2000" "Apr 4, 2001" "Apr 4, 2007" "Apr 4, 2014"
[391] "Apr 4, 2014" "Apr 4, 2014" "Apr 4, 2015" "Apr 4, 2018" "Apr 5, 1995" "Apr 5, 1997" "Apr 5, 1998" "Apr 5, 1998" "Apr 5, 1999" "Apr 5, 2002"
[401] "Apr 5, 2006" "Apr 5, 2007" "Apr 5, 2009" "Apr 5, 2010" "Apr 5, 2016" "Apr 5, 2017" "Apr 5, 2018" "Apr 5, 2019" "Apr 6, 1998" "Apr 6, 1998"
[411] "Apr 6, 2001" "Apr 6, 2005" "Apr 6, 2006" "Apr 6, 2008" "Apr 6, 2009" "Apr 6, 2009" "Apr 6, 2010" "Apr 6, 2010" "Apr 6, 2011" "Apr 6, 2011"
[421] "Apr 6, 2011" "Apr 6, 2011" "Apr 6, 2011" "Apr 6, 2014" "Apr 6, 2014" "Apr 6, 2014" "Apr 6, 2014" "Apr 6, 2014" "Apr 6, 2014" "Apr 6, 2014"
[431] "Apr 6, 2015" "Apr 6, 2016" "Apr 6, 2019" "Apr 7, 1997" "Apr 7, 1998" "Apr 7, 1999" "Apr 7, 2002" "Apr 7, 2004" "Apr 7, 2006" "Apr 7, 2007"
[441] "Apr 7, 2008" "Apr 7, 2008" "Apr 7, 2009" "Apr 7, 2009" "Apr 7, 2014" "Apr 7, 2015" "Apr 7, 2019" "Apr 8, 1994" "Apr 8, 1994" "Apr 8, 1997"
[451] "Apr 8, 1998" "Apr 8, 1998" "Apr 8, 1998" "Apr 8, 1999" "Apr 8, 1999" "Apr 8, 1999" "Apr 8, 2000" "Apr 8, 2004" "Apr 8, 2005" "Apr 8, 2008"
[461] "Apr 8, 201 4" "Apr 8, 2010" "Apr 8, 2010" "Apr 8, 2014" "Apr 8, 2014" "Apr 8, 2015" "Apr 8, 2015" "Apr 8, 2016" "Apr 8, 2018" "Apr 8, 2019"
[471] "Apr 9, 1996" "Apr 9, 1997" "Apr 9, 1998" "Apr 9, 1999" "Apr 9, 1999" "Apr 9, 1999" "Apr 9, 2002" "Apr 9, 2005" "Apr 9, 2006" "Apr 9, 2007"
[481] "Apr 9, 2011" "Apr 9, 2013" "Apr 9, 2015" "Apr 9, 2016" "Apr 9, 2016" "Apr 9, 2016" "Apr 9, 2018" "Apr 9, 2019" "Aug 03, 1996" "Aug 1, 1994"
[491] "Aug 1, 1994" "Aug 1, 1998" "Aug 1, 1998" "Aug 1, 2001" "Aug 1, 2003" "Aug 1, 2005" "Aug 1, 2006" "Aug 1, 2007" "Aug 1, 2008" "Aug 1, 2008"
[501] "Aug 1, 2008" "Aug 1, 2009" "Aug 1, 2009" "Aug 1, 2010" "Aug 1, 2010" "Aug 1, 2012" "Aug 1, 2014" "Aug 1, 2017" "Aug 1, 2017" "Aug 1, 2018"
[511] "Aug 1, 2018" "Aug 1, 2019" "Aug 1, 2019" "Aug 1, 2019" "Aug 10, 1995" "Aug 10, 1999" "Aug 10, 1999" "Aug 10, 1999" "Aug 10, 2000" "Aug 10, 2003"
[521] "Aug 10, 2004" "Aug 10, 2004" "Aug 10, 2005" "Aug 10, 2006" "Aug 10, 2010" "Aug 10, 2014" "Aug 10, 2015" "Aug 10, 2016" "Aug 10, 2016" "Aug 10, 2017"
[531] "Aug 11, 1994" "Aug 11, 1995" "Aug 11, 1996" "Aug 11, 1996" "Aug 11, 1997" "Aug 11, 1998" "Aug 11, 1999" "Aug 11, 2000" "Aug 11, 2005" "Aug 11, 2006"
[541] "Aug 11, 2008" "Aug 11, 2010" "Aug 11, 2011" "Aug 11, 2011" "Aug 11, 2012" "Aug 11, 2016" "Aug 11, 2017" "Aug 11, 2017" "Aug 12, 1993" "Aug 12, 1996"
[551] "Aug 12, 1997" "Aug 12, 1998" "Aug 12, 1998" "Aug 12, 1999" "Aug 12, 1999" "Aug 12, 2002" "Aug 12, 2004" "Aug 12, 2008" "Aug 12, 2014" "Aug 12, 2015"
[561] "Aug 12, 2017" "Aug 13, 1993" "Aug 13, 1999" "Aug 13, 1999" "Aug 13, 2000" "Aug 13, 2004" "Aug 13, 2004" "Aug 13, 2005" "Aug 13, 2008" "Aug 13, 2010"
[571] "Aug 13, 2012" "Aug 13, 2013" "Aug 13, 2015" "Aug 13, 2018" "Aug 13, 2018" "Aug 13, 2018" "Aug 14, 1996" "Aug 14, 2001" "Aug 14, 2001" "Aug 14, 2002"
[581] "Aug 14, 2002" "Aug 14, 2003" "Aug 14, 2008" "Aug 14, 2010" "Aug 14, 2014" "Aug 14, 2014" "Aug 14, 2014" "Aug 14, 2014" "Aug 14, 2014" "Aug 14, 2014"
[591] "Aug 14, 2014" "Aug 14, 2014" "Aug 14, 2016" "Aug 14, 2017" "Aug 14, 2017" "Aug 14, 2018" "Aug 15, 1996" "Aug 15, 1997" "Aug 15, 2002" "Aug 15, 2004"
[601] "Aug 15, 2004" "Aug 15, 2007" "Aug 15, 2008" "Aug 15, 2008" "Aug 15, 2009" "Aug 15, 2011" "Aug 15, 2012" "Aug 15, 2013" "Aug 15, 2013" "Aug 15, 2013"
[611] "Aug 15, 2014" "Aug 15, 2014" "Aug 15, 2016" "Aug 15, 2018" "Aug 15, 2019" "Aug 16, 1997" "Aug 16, 1999" "Aug 16, 2000" "Aug 16, 2001" "Aug 16, 2003"
[621] "Aug 16, 2003" "Aug 16, 2005" "Aug 16, 2006" "Aug 16, 2010" "Aug 16, 2010" "Aug 16, 2012" "Aug 16, 2015" "Aug 16, 2015" "Aug 16, 2017" "Aug 16, 2017"
[631] "Aug 16, 2017" "Aug 16, 2017" "Aug 16, 2019" "Aug 16, 2019" "Aug 16, 2019" "Aug 17, 2000" "Aug 17, 2001" "Aug 17, 2009" "Aug 17, 2011" "Aug 17, 2011"
[641] "Aug 17, 2011" "Aug 17, 2012" "Aug 17, 2014" "Aug 17, 2014" "Aug 17, 2016" "Aug 17, 2017" "Aug 17, 2019" "Aug 18, 1998" "Aug 18, 1999" "Aug 18, 1999"
[651] "Aug 18, 1999" "Aug 18, 2000" "Aug 18, 2008" "Aug 18, 2011" "Aug 18, 2014" "Aug 18, 2016" "Aug 18, 2017" "Aug 18, 2019" "Aug 18, 2019" "Aug 18, 2019"
[661] "Aug 18, 2019" "Aug 19, 1998" "Aug 19, 1998" "Aug 19, 1998" "Aug 19, 1998" "Aug 19, 1998" "Aug 19, 2000" "Aug 19, 2000" "Aug 19, 2002" "Aug 19, 2004"
[671] "Aug 19, 2005" "Aug 19, 2008" "Aug 19, 2009" "Aug 19, 2009" "Aug 19, 2010" "Aug 19, 2010" "Aug 19, 2010" "Aug 19, 2011" "Aug 19, 2011" "Aug 19, 2012"
[681] "Aug 19, 2014" "Aug 19, 2014" "Aug 19, 2015" "Aug 19, 2016" "Aug 19, 2016" "Aug 19, 2019" "Aug 19, 2019" "Aug 19, 2019" "Aug 19, 2019" "Aug 2, 1997"
[691] "Aug 2, 1997" "Aug 2, 1998" "Aug 2, 2000" "Aug 2, 2000" "Aug 2, 2000" "Aug 2, 2003" "Aug 2, 2003" "Aug 2, 2004" "Aug 2, 2007" "Aug 2, 2008"
[701] "Aug 2, 2014" "Aug 2, 2015" "Aug 2, 2015" "Aug 2, 2015" "Aug 2, 2017" "Aug 2, 2018" "Aug 20, 1996" "Aug 20, 1996" "Aug 20, 1996" "Aug 20, 1997"
[711] "Aug 20, 1997" "Aug 20, 1998" "Aug 20, 1998" "Aug 20, 1999" "Aug 20, 2003" "Aug 20, 2003" "Aug 20, 2005" "Aug 20, 2006" "Aug 20, 2012" "Aug 20, 2013"
[721] "Aug 20, 2014" "Aug 20, 2014" "Aug 20, 2015" "Aug 20, 2019" "Aug 20, 2019" "Aug 21, 1993" "Aug 21, 1996" "Aug 21, 1997" "Aug 21, 1997" "Aug 21, 1999"
[731] "Aug 21, 1999" "Aug 21, 2000" "Aug 21, 2000" "Aug 21, 2003" "Aug 21, 2006" "Aug 21, 2008" "Aug 21, 2008" "Aug 21, 2009" "Aug 21, 2013" "Aug 21, 2014"
[741] "Aug 21, 2018" "Aug 21, 2018" "Aug 21, 2018" "Aug 21, 2018" "Aug 21, 2019" "Aug 21, 2019" "Aug 21, 2019" "Aug 21, 2019" "Aug 21, 2019" "Aug 22, 1994"
[751] "Aug 22, 1998" "Aug 22, 1998" "Aug 22, 2001" "Aug 22, 2003" "Aug 22, 2008" "Aug 22, 2011" "Aug 22, 2016" "Aug 22, 2018" "Aug 22, 2018" "Aug 22, 2018"
[761] "Aug 22, 2019" "Aug 23, 1996" "Aug 23, 1997" "Aug 23, 1997" "Aug 23, 1998" "Aug 23, 1999" "Aug 23, 2000" "Aug 23, 2001" "Aug 23, 2004" "Aug 23, 2005"
[771] "Aug 23, 2005" "Aug 23, 2005" "Aug 23, 2006" "Aug 23, 2006" "Aug 23, 2006" "Aug 23, 2007" "Aug 23, 2010" "Aug 23, 2013" "Aug 23, 2013" "Aug 23, 2016"
[781] "Aug 23, 2016" "Aug 23, 2018" "Aug 23, 2019" "Aug 23, 2019" "Aug 23, 2019" "Aug 24, 1993" "Aug 24, 1994" "Aug 24, 1997" "Aug 24, 1998" "Aug 24, 1998"
[791] "Aug 24, 1999" "Aug 24, 1999" "Aug 24, 2000" "Aug 24, 2000" "Aug 24, 2002" "Aug 24, 2005" "Aug 24, 2005" "Aug 24, 2005" "Aug 24, 2007" "Aug 24, 2008"
[801] "Aug 24, 2008" "Aug 24, 2009" "Aug 24, 2011" "Aug 24, 2013" "Aug 24, 2014" "Aug 24, 2016" "Aug 24, 2017" "Aug 24, 2017" "Aug 24, 2017" "Aug 24, 2017"
[811] "Aug 24, 2019" "Aug 24, 2019" "Aug 25, 1995" "Aug 25, 1997" "Aug 25, 1998" "Aug 25, 1998" "Aug 25, 1999" "Aug 25, 2003" "Aug 25, 2003" "Aug 25, 2005"
[821] "Aug 25, 2006" "Aug 25, 2008" "Aug 25, 2008" "Aug 25, 2009" "Aug 25, 2012" "Aug 25, 2016" "Aug 25, 2016" "Aug 25, 2016" "Aug 26, 1995" "Aug 26, 1997"
[831] "Aug 26, 1998" "Aug 26, 2000" "Aug 26, 2001" "Aug 26, 2003" "Aug 26, 2008" "Aug 26, 2008" "Aug 26, 2008" "Aug 26, 2009" "Aug 26, 2009" "Aug 26, 2013"
[841] "Aug 26, 2014" "Aug 26, 2016" "Aug 26, 2018" "Aug 27, 1997" "Aug 27, 1997" "Aug 27, 1999" "Aug 27, 2002" "Aug 27, 2003" "Aug 27, 2005" "Aug 27, 2008"
[851] "Aug 27, 2014" "Aug 27, 2014" "Aug 27, 2015" "Aug 27, 2017" "Aug 27, 2019" "Aug 28, 1995" "Aug 28, 1996" "Aug 28, 1997" "Aug 28, 1999" "Aug 28, 1999"
[861] "Aug 28, 2000" "Aug 28, 2002" "Aug 28, 2005" "Aug 28, 2005" "Aug 28, 2007" "Aug 28, 2008" "Aug 28, 2011" "Aug 28, 2011" "Aug 28, 2011" "Aug 28, 2012"
[871] "Aug 28, 2012" "Aug 28, 2013" "Aug 28, 2014" "Aug 28, 2014" "Aug 28, 2014" "Aug 28, 2016" "Aug 28, 2016" "Aug 28, 2017" "Aug 28, 2018" "Aug 28, 2018"
[881] "Aug 29, 1994" "Aug 29, 2000" "Aug 29, 2002" "Aug 29, 2005" "Aug 29, 2005" "Aug 29, 2007" "Aug 29, 2010" "Aug 29, 2012" "Aug 29, 2012" "Aug 29, 2013"
[891] "Aug 29, 2014" "Aug 29, 2016" "Aug 29, 2017" "Aug 29, 2017" "Aug 29, 2018" "Aug 3, 1999" "Aug 3, 2004" "Aug 3, 2004" "Aug 3, 2005" "Aug 3, 2005"
[901] "Aug 3, 2005" "Aug 3, 2007" "Aug 3, 2008" "Aug 3, 2008" "Aug 3, 2008" "Aug 3, 2008" "Aug 3, 2010" "Aug 3, 2014" "Aug 3, 2014" "Aug 3, 2014"
[911] "Aug 3, 2017" "Aug 30, 1994" "Aug 30, 1998" "Aug 30, 2000" "Aug 30, 2003" "Aug 30, 2004" "Aug 30, 2005" "Aug 30, 2006" "Aug 30, 2007" "Aug 30, 2007"
[921] "Aug 30, 2008" "Aug 30, 2009" "Aug 30, 2010" "Aug 30, 2015" "Aug 30, 2015" "Aug 30, 2015" "Aug 30, 2015" "Aug 30, 2015" "Aug 30, 2016" "Aug 30, 2017"
[931] "Aug 30, 2017" "Aug 30, 2018" "Aug 30, 2018" "Aug 31, 1995" "Aug 31, 1999" "Aug 31, 2000" "Aug 31, 2004" "Aug 31, 2005" "Aug 31, 2005" "Aug 31, 2009"
[941] "Aug 31, 2011" "Aug 31, 2013" "Aug 31, 2014" "Aug 31, 2015" "Aug 31, 2016" "Aug 31, 2016" "Aug 31, 2017" "Aug 31, 2017" "Aug 31, 2018" "Aug 4, 1995"
[951] "Aug 4, 1995" "Aug 4, 1999" "Aug 4, 2003" "Aug 4, 2003" "Aug 4, 2004" "Aug 4, 2005" "Aug 4, 2005" "Aug 4, 2006" "Aug 4, 2006" "Aug 4, 2007"
[961] "Aug 4, 2008" "Aug 4, 2012" "Aug 4, 2017" "Aug 4, 2017" "Aug 5, 1994" "Aug 5, 1995" "Aug 5, 1998" "Aug 5, 1999" "Aug 5, 2002" "Aug 5, 2003"
[971] "Aug 5, 2005" "Aug 5, 2005" "Aug 5, 2010" "Aug 5, 2010" "Aug 5, 2015" "Aug 5, 2015" "Aug 5, 2017" "Aug 5, 2017" "Aug 5, 2019" "Aug 5, 2019"
[981] "Aug 5, 2019" "Aug 6, 1993" "Aug 6, 1997" "Aug 6, 1998" "Aug 6, 2001" "Aug 6, 2002" "Aug 6, 2003" "Aug 6, 2004" "Aug 6, 2006" "Aug 6, 2006"
[991] "Aug 6, 2014" "Aug 6, 2015" "Aug 6, 2015" "Aug 6, 2016" "Aug 6, 2016" "Aug 6, 2017" "Aug 6, 2017" "Aug 6, 2017" "Aug 6, 2017" "Aug 7, 1993"
[ reached getOption("max.print") -- omitted 5461 entries ]
# Article count before de-duplication (6461 in the recorded run).
numPreDeDup <- nrow(pq_metaclean)
print(paste("Number of articles pre-dedup:", numPreDeDup))
[1] "Number of articles pre-dedup: 6461"
# Total number of surplus copies of identical full texts: for each distinct
# text, its occurrence count minus one, summed (184 in the recorded run).
numDup <- sum(table(pq_metaclean[["Full text"]]) - 1)
print(paste("Number of duplicates:", numDup))
[1] "Number of duplicates: 184"
# Show the raw rows whose Publication info carries the "Duplicate" flag,
# ordered by parsed publication date. The flag alone is not sufficient to
# remove all duplicates (53 flagged rows, original note).
pq_metadata %>%
  filter(grepl("Duplicate", `Publication info`)) %>%
  select(`Full text`, `Publication info`, `Publication date`) %>%
  mutate(`Publication date` = as.Date(`Publication date`, format = "%b %d, %Y")) %>%
  arrange(`Publication date`)
# De-duplicate the corpus:
#   1. drop rows flagged "Duplicate" in `Publication info`;
#   2. parse `Publication date` into a Date;
#   3. for each distinct `Full text`, keep the row with the earliest date.
# Date parsing is more tolerant than a single "%b %d, %Y" format:
#   - a stray space inside the year after the comma ("Apr 8, 201 4") is closed
#     up first, so the full four-digit year is read;
#   - "22 Nov 1997"-style values (filled in from Publication info) are parsed
#     with a day-first format instead of becoming NA;
#   - ISO dates are accepted so that re-running this cell on an already
#     converted column leaves the dates intact.
# The result is ungrouped so later steps do not silently inherit the
# `Full text` grouping.
pq_metaclean <- pq_metaclean %>%
  filter(!grepl("Duplicate", `Publication info`)) %>%
  mutate(
    `Publication date` = sub(
      ",\\s*(\\d+)\\s+(\\d+)\\s*$", ", \\1\\2",
      as.character(`Publication date`)
    ),
    `Publication date` = coalesce(
      as.Date(`Publication date`, format = "%b %d, %Y"),
      as.Date(`Publication date`, format = "%d %b %Y"),
      as.Date(`Publication date`, format = "%Y-%m-%d")
    )
  ) %>%
  group_by(`Full text`) %>%
  arrange(`Publication date`) %>%
  slice(1L) %>%
  ungroup()
# Display the publication info and parsed date of the rows that were kept.
select(pq_metaclean, `Publication info`, `Publication date`)
Adding missing grouping variables: `Full text`
# Article count after de-duplication (original note: 6277 to 6239).
numPostDeDup <- nrow(pq_metaclean)
print(paste("Number of articles post-dedup:", numPostDeDup))
[1] "Number of articles post-dedup: 6239"
# Rows removed by de-duplication: pre-dedup count minus post-dedup count
# (original note: 184 to 222).
numRemoved <- numPreDeDup - numPostDeDup
print(paste("Number of duplicates removed:", numRemoved))
[1] "Number of duplicates removed: 222"
# Save the cleaned corpus to "<rFileNum>_pq_metaclean.csv" in the working-data
# folder. `rFileNum` and `overwrite` are defined outside this section.
# The file is written only if it does not exist yet or `overwrite` is set.
outputFileNameClean <- paste0(rFileNum, "_pq_metaclean")
outputFolder <- "Data/02_Working/"
outputFileClean <- paste0(outputFolder, outputFileNameClean, ".csv")
# `||` is the scalar, short-circuiting operator intended for `if` conditions.
if (!file.exists(outputFileClean) || overwrite) {
  write.csv(pq_metaclean, outputFileClean, row.names = FALSE)
}